Rui Huang
You are a data scientist in healthcare. Your manager gives you two Coronavirus tweets datasets.
Tweet_texts:¶import pandas as pd
final_train = pd.read_csv('../datasets/Corona_NLP_train.csv', encoding='latin-1')
cols = ['Location', 'TweetAt', 'OriginalTweet']
# We need to fill NaN as empty string
final_train.Location = final_train.Location.fillna('')
final_train['Tweet_texts'] = final_train[cols].apply(lambda row: ' '.join(row.values.astype(str)), axis=1)
final_train.head(5)
| UserName | ScreenName | Location | TweetAt | OriginalTweet | Sentiment | Tweet_texts | |
|---|---|---|---|---|---|---|---|
| 0 | 3799 | 48751 | London | 16-03-2020 | @MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i... | Neutral | London 16-03-2020 @MeNyrbie @Phil_Gahan @Chris... |
| 1 | 3800 | 48752 | UK | 16-03-2020 | advice Talk to your neighbours family to excha... | Positive | UK 16-03-2020 advice Talk to your neighbours f... |
| 2 | 3801 | 48753 | Vagabonds | 16-03-2020 | Coronavirus Australia: Woolworths to give elde... | Positive | Vagabonds 16-03-2020 Coronavirus Australia: Wo... |
| 3 | 3802 | 48754 | 16-03-2020 | My food stock is not the only one which is emp... | Positive | 16-03-2020 My food stock is not the only one ... | |
| 4 | 3803 | 48755 | 16-03-2020 | Me, ready to go at supermarket during the #COV... | Extremely Negative | 16-03-2020 Me, ready to go at supermarket dur... |
Tweet_texts:¶import re
text_array = []
final_sample = final_train.sample(frac=0.01, random_state=100)
for tweet in final_sample.Tweet_texts:
text_array.append(tweet)
text_array[-10]
"Atlanta, y'all 19-03-2020 Â\x93the food supply is sufficient.Â\x94 | #Coronavirus has thrown a wrench into the #food supply chain. Companies scramble to fill supermarket shelves as quickly as consumers empty them. Â\x93This is war.Â\x94 https://t.co/9ro9XWsSf3 via @WSJ #cre"
for index, text in enumerate(text_array):
text_array[index] = re.sub(r'@(\w){1,15}(:)?', '', text)
text_array[-10]
"Atlanta, y'all 19-03-2020 Â\x93the food supply is sufficient.Â\x94 | #Coronavirus has thrown a wrench into the #food supply chain. Companies scramble to fill supermarket shelves as quickly as consumers empty them. Â\x93This is war.Â\x94 https://t.co/9ro9XWsSf3 via #cre"
for index, text in enumerate(text_array):
text_array[index] = re.sub(r'[^\x00-\x7f]', r'', text)
text_array[-10]
"Atlanta, y'all 19-03-2020 the food supply is sufficient. | #Coronavirus has thrown a wrench into the #food supply chain. Companies scramble to fill supermarket shelves as quickly as consumers empty them. This is war. https://t.co/9ro9XWsSf3 via #cre"
for index, text in enumerate(text_array):
text_array[index] = re.sub('https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]+\.[^\s]{2,}', '', text)
text_array[-10]
"Atlanta, y'all 19-03-2020 the food supply is sufficient. | #Coronavirus has thrown a wrench into the #food supply chain. Companies scramble to fill supermarket shelves as quickly as consumers empty them. This is war. via #cre"
for index, text in enumerate(text_array):
text_array[index] = re.sub(r'[0-9]{2}-[0-9]{2}-[0-9]{4}', '', text)
text_array[-10]
"Atlanta, y'all the food supply is sufficient. | #Coronavirus has thrown a wrench into the #food supply chain. Companies scramble to fill supermarket shelves as quickly as consumers empty them. This is war. via #cre"
for index, text in enumerate(text_array):
text_array[index] = re.sub('\#', '', text)
text_array[-10]
"Atlanta, y'all the food supply is sufficient. | Coronavirus has thrown a wrench into the food supply chain. Companies scramble to fill supermarket shelves as quickly as consumers empty them. This is war. via cre"
for index, text in enumerate(text_array):
text_array[index] = re.sub("'", '', text)
text_array[-10]
'Atlanta, yall the food supply is sufficient. | Coronavirus has thrown a wrench into the food supply chain. Companies scramble to fill supermarket shelves as quickly as consumers empty them. This is war. via cre'
for index, text in enumerate(text_array):
text_array[index] = re.sub('\W+', ' ', text)
text_array[-10]
'Atlanta yall the food supply is sufficient Coronavirus has thrown a wrench into the food supply chain Companies scramble to fill supermarket shelves as quickly as consumers empty them This is war via cre'
for index, text in enumerate(text_array):
text_array[index] = re.sub(r'\b[a-zA-Z]{1,2}\b', '', text)
text_array[-10]
'Atlanta yall the food supply sufficient Coronavirus has thrown wrench into the food supply chain Companies scramble fill supermarket shelves quickly consumers empty them This war via cre'
for index, text in enumerate(text_array):
text_array[index] = re.sub('\W+', ' ', text)
text_array[-10]
'Atlanta yall the food supply sufficient Coronavirus has thrown wrench into the food supply chain Companies scramble fill supermarket shelves quickly consumers empty them This war via cre'
for text in text_array:
if text == '':
text_array.remove(text)
len(text_array)
412
import spacy
from nltk.tokenize import word_tokenize
nlp = spacy.load('en_core_web_sm')
all_stopwords = nlp.Defaults.stop_words
all_words = ' '.join(text_array)
text_tokens = word_tokenize(all_words)
tokens_without_sw = [word for word in text_tokens if not word in all_stopwords]
all_text = ' '.join(tokens_without_sw)
nlp.max_length = 10000000
doc = nlp(all_text)
Q2_df = pd.DataFrame(columns = ["text", "Lemma", "Pos", "Tag", "Dep", "Shape", "Is alpha", "Is stop"])
for token in doc:
Q2_df.loc[-1] = [token.text, token.lemma_, token.pos_, token.tag_, token.dep_, token.shape_, token.is_alpha, token.is_stop]
Q2_df.index = Q2_df.index + 1
Q2_df.head()
| text | Lemma | Pos | Tag | Dep | Shape | Is alpha | Is stop | |
|---|---|---|---|---|---|---|---|---|
| 7920 | pause | pause | NOUN | NN | nmod | xxxx | True | False |
| 7919 | economic | economic | ADJ | JJ | amod | xxxx | True | False |
| 7918 | activity | activity | NOUN | NN | compound | xxxx | True | False |
| 7917 | dent | dent | NOUN | NN | compound | xxxx | True | False |
| 7916 | rubber | rubber | NOUN | NN | compound | xxxx | True | False |
from spacy import displacy
options = { "compact": True, "color": "white", "bg": "#09a3d5", "font": "Source Sans Pro" }
displacy.render(doc, style="dep", options=options, jupyter=True)
Q2_df_2 = pd.DataFrame(columns = ["Text", "Start Character", "End Character", "Label"])
for entity in doc.ents:
Q2_df_2 = Q2_df_2.append({"Text": entity.text, "Start Character": entity.start_char, "End Character": entity.end_char, "Label": entity.label_}, ignore_index=True)
Q2_df_2.head(5)
| Text | Start Character | End Character | Label | |
|---|---|---|---|---|
| 0 | Canada | 43 | 49 | GPE |
| 1 | Sheds | 486 | 491 | NORP |
| 2 | Austin | 556 | 562 | GPE |
| 3 | Texas | 563 | 568 | GPE |
| 4 | Philippines | 775 | 786 | GPE |
options = {"ents": ["GPE", "MONEY", "QUANTITY"]}
displacy.render(doc, style="ent", options=options)
Tweet_texts.¶tokens_without_sw
['pause', 'economic', 'activity', 'dent', 'rubber', 'prices', 'Canada', 'Remember', 'thank', 'working', 'essential', 'services', 'responders', 'grocery', 'store', 'pharmacy', 'staff', 'taxi', 'bus', 'drivers', 'police', 'officer', 'firefighter', 'utility', 'worker', 'Thank', 'You', 'covid19', 'coronavirus', 'USA', 'You', 'didnt', 'read', 'post', 'carefully', 'The', 'COVID', '19', 'impacts', 'real', 'But', 'drop', 'stock', 'prices', 'doesnt', 'necessarily', 'translate', 'drop', 'demand', 'majority', 'everyday', 'buyers', 'Why', 'youre', 'talking', 'Cannes', 'France', 'Covid', '19', 'Blackstone', 'buys', 'logistics', 'assets', 'demand', 'soars', 'online', 'shopping', 'health', 'crisis', 'Sheds', 'rainbow', 'curved', 'air', 'kiev', 'Hey', 'love', 'talk', 'workers', 'Workers', 'fill', 'form', 'Austin', 'Texas', 'lot', 'stores', 'closed', 'coronavirus', 'youre', 'online', 'shopping', 'dont', 'forget', 'research', 'youre', 'buying', 'Try', 'purchase', 'brands', 'making', 'environmentally', 'friendly', 'decisions', 'sustainabilitytips', 'Sustainability', 'onlineshopping', 'Republic', 'Philippines', 'support', 'government', 'efforts', 'fight', 'spread', 'COVID', '19', 'City', 'Paranaque', 'closed', 'notice', 'However', 'Supermarket', 'Watsons', 'BDO', 'Carpark', 'remain', 'open', 'let', 'fight', 'COVID', '19', 'London', 'First', 'time', 'supermarket', '10', 'days', 'shelves', 'like', 'CoronaCrisis', 'Nagpur', 'India', 'Why', 'Government', 'transmitting', 'benefits', 'fall', 'Cruds', 'oil', 'prices', 'mean', 'big', 'slash', 'atleast', 'bring', 'like', 'petrol', 'prices', '60', 'litre', 'crudeoil', 'PetrolPrice', 'coronavirusindia', 'COVID2019', 'probably', 'local', 'lately', 'wondered', 'What', 'happened', 'sign', 'The', 'End', 'The', 'taken', 'lives', 'caused', 'lose', 'PRESENTED', 'Living', 'Waters', 'Wellington', 'New', 'Zealand', 'Heres', 'info', 'bank', 'help', 'Covid', '19', 'Fremont', 'childhood', 'friend', 'says', 'Vermonters', 'panic', 'buying', 'food', 'guns', 'ammo', 'Now', 'theres', 'barometer', 'showing', 'high', 'pressure', 'coronavirus', 'When', 'actually', 'score', 'grocery', 'store', 'Covid_19', 'SocialDistancing', 'SundayMotivation', 'Los', 'Angeles', 'London', 'Advertising', 'Agencies', 'need', 'message', 'There', 'plenty', 'food', 'Stop', 'panic', 'GLIUMEDIA', 'Westport', 'New', 'York', 'What', 'brands', 'Amazon', 'expect', 'right', 'Our', 'Marketplace', 'Strategy', 'Rina', 'Yashayeva', 'describes', 'consumer', 'search', 'behavior', 'changed', 'Amazon', 'reacting', 'COVID', '19', 'Amazon', 'COVID19', 'CAN', 'AFRICA', 'WORK', 'LINE', 'new', 'survey', 'shows', '80', 'people', 'asked', 'work', 'home', 'Covid', '19', 'productive', 'They', 'spending', 'time', 'social', 'media', 'entertainment', 'shopping', 'working', 'Africa', 'ready', 'work', 'online', 'Liverpool', 'England', 'work', 'supermarket', 'Wednesday', 'ANYONE', 'shopping', 'essentials', 'trolleys', 'getting', 'punch', 'throat', 'COVIDIDIOTS', 'CoronavirusUK', 'CoronavirusBillUK', 'Coronavirus', 'UKlockdown', 'UKLockedDown', 'CornavirusOutbreak', 'Moscow', 'Coronavirus', 'isnt', 'wreaking', 'havoc', 'health', 'poised', 'feed', 'Middle', 'East', 'unrest', 'possibly', 'terrorism', 'Experts', 'agree', 'punch', 'COVID19', 'resulting', 'lower', 'oil', 'prices', 'exacerbate', 'problems', 'Iran', 'Iraq', 'Havant', 'Constituency', '3rd', 'parties', 'pushing', 'price', 'ffp2', 'masks', '5x', 'people', 'need', 'This', 'criminal', 'offense', 'codice', 'penale', 'art', '501bis', 'decent', 'human', 'beings', 'prices', 'moderated', 'update', 'yesterday', 'base', 'rent', 'Malaysia', 'Singapore', 'walking', 'supermarket', 'food', 'run', 'thinking', 'prolong', 'sweet', 'ecstacy', 'browsing', 'aisles', 'watching', 'humans', 'afar', 'like', 'zoo', 'SocialDistancing', 'Covid_19', 'Barcelona', 'Home', 'supermarket', '116mx2', 'Nobody', 'street', 'Goooo', 'power', 'live', 'life', 'covid', '19', 'coronavirus', 'alltogether', 'barcelona', 'santpau', 'gaudi', 'Berkeley', 'The', 'biggest', 'airlines', 'spent', '96', 'free', 'cash', 'flow', 'decade', 'buy', 'shares', 'stock', 'order', 'boost', 'executive', 'bonuses', 'wealthy', 'investors', 'Now', 'expect', 'taxpayers', 'bail', 'tune', '50', 'billion', 'Its', 'old', 'story', 'London', 'coronavirus', 'pushes', 'carbon', 'price', '16', 'month', 'low', 'Naivas', 'Supermarket', 'rewards', 'Baringo', 'Commander', 'Ibrahim', 'Abajila', 'officer', 'Amina', 'Mutio', 'Ramadhan', 'sets', 'gift', 'vouchers', 'exemplary', 'service', 'enforcement', 'curfew', 'extends', 'temporary', 'store', 'closures', 'furloughs', 'workforce', 'retail', 'steinmart', 'coronavirus', 'housewares', 'homeworld', 'youre', 'telling', 'work', 'supermarket', 'putting', 'literal', 'fucking', 'life', 'line', 'doctor', 'nurse', 'covid', '19', 'shit', 'right', 'minimum', 'wage', 'What', 'kind', 'fuckery', 'Mxico', 'Cattle', 'gridlock', 'border', 'delays', 'add', 'coronavirus', 'strain', 'meat', 'trade', 'Washington', 'China', 'showing', 'signs', 'recovery', 'However', 'recent', 'QuickTake', 'pointed', 'recovery', 'hampered', 'shutdown', 'business', 'travel', 'events', 'slowing', 'consumer', 'demand', 'internationally', 'virus', 'hits', 'rest', 'world', 'covid_19', 'Canada', 'Key', 'Food', 'Prices', 'Are', 'Surging', 'After', 'Virus', 'Upends', 'Supply', 'Chains', 'USA', 'Australia', 'Subsistence', 'miners', 'lose', 'CoronaVirus', 'crushes', 'local', 'gold', 'prices', 'think', '28', 'Days', 'Later', 'scene', 'supermarket', 'shelves', 'nicely', 'stacked', 'coronavirus', 'Earth', 'working', 'till', 'Friday', 'teaching', 'online', 'Then', 'spend', 'time', 'sister', 'grocery', 'store', 'Covid_19', 'coronavirus', 'Quarantine', 'grocery', 'store', 'chains', 'food', 'companies', 'audit', 'products', 'stock', 'sell', 'folks', 'leaving', 'products', 'shelf', 'time', 'like', 'food', 'industry', 'probably', 'isnt', 'calling', 'coronavirus', 'MENA', 'Perceived', 'danger', 'covid19', 'high', 'UAE', 'downplayed', 'slightly', 'youth', 'westerners', 'The', 'majority', 'feel', 'state', 'coronavirus', 'crisis', 'concern', 'higher', 'youth', 'locals', 'Download', 'report', 'Minas', 'Tirith', 'Interspar', 'local', 'supermarket', 'asking', 'customers', 'limit', 'purchases', '5kg', '10', 'pieces', 'item', 'Bangalore', 'INDIA', 'Consumers', 'needs', 'different', 'stakeholders', 'Covid', '19', 'crisis', 'wonder', 'similar', 'different', 'needs', 'Indian', 'Healthcare', 'Consumer', 'context', 'Tesco', 'latest', 'supermarket', 'create', 'jobs', 'meet', 'surge', 'coronavirus', 'COVID19', 'COVID', '19', 'Covid_19', 'corona', 'CoronavirusOutbreak', 'prepper', 'survival', 'bushcraft', 'follow', 'Huntington', 'Beach', 'Due', 'Covid', '19', 'Virus', 'BASEBALL', 'CARDS', 'PLUS', 'retail', 'location', 'close', '2', 'weeks', 'SUPPORT', 'effort', 'wipe', 'faster', 'taking', 'store', 'phone', 'orders', 'shipped', 'FREE', 'period', 'Plus', 'Ebay', 'Store', 'offering', 'day', 'service', 'Londonderry', 'Northern', 'Ireland', 'THE', 'MARXIST', 'NANNYSTATE', 'Police', 'Officer', 'Orders', 'Family', 'Back', 'Inside', 'For', 'Playing', 'Their', 'Yard', 'Others', 'PROWL', 'Supermarket', 'Aisles', 'Looking', 'For', 'Shoppers', 'Buying', 'Non', 'Essential', 'Items', 'CommonPurpose', 'Soros', 'Rothschild', 'coronavirus', 'HappyEaster', 'Wichita', 'Kansas', 'Meitzner', 'says', 'Sedgwick', 'legal', 'department', 'working', 'grocery', 'stores', 'possible', 'designated', 'shopping', 'hours', 'vulnerable', 'populations', 'ensuring', 'theres', 'hand', 'sanitizer', 'etc', 'employees', 'kakenews', 'coronavirus', 'covid19', 'New', 'Jersey', 'USA', 'week', 'important', 'stay', 'home', 'Birx', 'said', 'skip', 'drug', 'store', 'grocery', 'Covid_19', 'COVID19', 'abc7ny', 'ABCNews', 'CoronaCrisis', 'Covid_19', 'weird', 'eerie', 'scene', 'England', 'Deserted', 'streets', 'buses', 'lots', 'seats', 'trains', 'bare', 'supermarket', 'shelves', 'shuttered', 'shops', 'Even', 'Jacket', 'potato', 'stall', 'shut', 'people', 'walking', 'clutching', 'shopping', 'bags', 'unusual', 'items', 'Dublin', 'holidaytrip', 'months', 'Should', 'cancel', 'cancel', 'late', 'pay', 'higher', 'fees', 'Know', 'consumerrights', 'airtravel', 'seatravel', 'amp', 'packageholidays', 'COVID2019IRELAND', 'Irishconsumers', 'Hoboken', 'Over', 'week', 'span', '25', '000', 'meals', 'distributed', 'volunteers', 'nearly', '600', 'Hoboken', 'seniors', 'help', 'indoors', 'reduce', 'trips', 'outside', 'home', 'coming', 'protect', 'Hobokens', 'vulnerable', 'population', 'COVID', '19', 'remain', 'HobokenStrong', 'Day', '7', 'isolation', 'Supermarket', 'shelves', 'Its', 'time', 'FAST', 'FOOD', 'ChinaVirus', 'coronavirus', 'isolationandchill', 'England', 'United', 'Kingdom', 'Coronavirus', 'Economics', 'Hiding', 'Behind', 'The', 'CoronaVirus', 'veil', 'The', 'Massive', 'Preparation', 'shadows', 'The', 'New', 'Currency', 'Shock', 'Digital', 'Dollar', 'Proposals', 'Set', 'Bitcoin', 'And', 'Crypto', 'Prices', 'Alight', 'Singapore', 'Lock', 'Freeze', 'bills', 'rents', 'mortgages', 'Give', '1000', 'month', 'months', '60bil', 'Continue', 'food', 'supply', 'chains', 'army', 'reservists', 'volunteers', 'deliver', 'online', 'food', 'orders', 'Woolworths', 'Coles', 'etc', 'national', 'operation', 'Coronaaustralia', 'Boston', 'Who', 'list', 'emergency', 'workers', 'days', 'coronavirus', 'health', 'crisis', 'Did', 'remember', 'add', 'grocery', 'store', 'workers', 'COVID19', 'coronavirus', 'Singapore', 'COVID', '19', 'limited', 'shopping', 'brick', 'mortar', 'stores', 'online', 'shopping', 'story', 'unexpected', 'catalyst', 'tech', 'adoption', 'Atlanta', 'sorry', 'inconvenience', 'You', 'minimize', 'impact', 'COVID', '19', 'credit', 'Talking', 'lenders', 'creditors', 'Paying', 'Staying', 'date', 'credit', 'reports', 'Considering', 'adding', 'consumer', 'state', 'New', 'York', 'The', 'best', 'credit', 'cards', 'maximizing', 'purchases', 'grocery', 'store', 'Covid', '19', 'Lockdown', 'Know', 'Your', 'Online', 'Grocery', 'Shopping', 'Options', 'Manchester', 'great', 'news', 'isolation', 'supermarket', 'deliver', 'food', 'week', 'Some', '3', 'weeks', 'And', 'people', 'shouldnt', 'stock', 'glad', 'support', 'friends', 'family', 'scary', 'didnt', 'cfprobs', 'isolation', 'coronavirus', 'London', 'New', 'announcements', 'running', 'tube', 'stations', 'All', 'TfL', 'services', 'solely', 'focussed', 'ensuring', 'critical', 'workers', 'needed', 'You', ...]
Tweet_texts column with the string only containing the lemmas of all the tokens.¶stop_words = spacy.lang.en.stop_words.STOP_WORDS
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer(language='english')
def clean_text(text):
text = re.sub(r'@(\w){1,15}(:)?', '', text)
text = re.sub(r'[^\x00-\x7f]', r'', text)
text = re.sub('https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]+\.[^\s]{2,}', '', text)
text = re.sub(r'[0-9]{2}-[0-9]{2}-[0-9]{4}', '', text)
text = re.sub('\#', '', text)
text = re.sub("'", '', text)
text = re.sub('\W+', ' ', text)
text = re.sub(r'\b[a-zA-Z]{1,2}\b', '', text)
text = stemmer.stem(text)
text = [ word.lemma_ for word in nlp(text) if word not in stop_words]
return ' '.join(text).strip().lower()
final_sample['Tweet_texts_original'] = final_sample['Tweet_texts']
final_sample['Tweet_texts'] = final_sample['Tweet_texts'].apply(clean_text)
final_sample.head()
| UserName | ScreenName | Location | TweetAt | OriginalTweet | Sentiment | Tweet_texts | Tweet_texts_original | |
|---|---|---|---|---|---|---|---|---|
| 39724 | 43523 | 88475 | 13-04-2020 | A pause in economic activity could dent rubber... | Neutral | pause economic activity could dent rubber pr... | 13-04-2020 A pause in economic activity could... | |
| 23511 | 27310 | 72262 | Canada | 26-03-2020 | Remember to say thank you to all those working... | Extremely Positive | canada remember say thank you all those wo... | Canada 26-03-2020 Remember to say thank you to... |
| 7136 | 10935 | 55887 | USA | 19-03-2020 | @David_desJ You didnÂt read my first post car... | Extremely Negative | usa you do nt read first post carefully the ... | USA 19-03-2020 @David_desJ You didnÂt read my... |
| 18484 | 22283 | 67235 | Cannes, France | 23-03-2020 | Covid-19: Blackstone buys logistics assets as ... | Negative | cannes france covid 19 blackstone buy logistic... | Cannes, France 23-03-2020 Covid-19: Blackstone... |
| 20459 | 24258 | 69210 | A rainbow in curved air | 25-03-2020 | in kiev Hey We at would love to talk to you an... | Extremely Positive | rainbow curve air kiev hey would love ... | A rainbow in curved air 25-03-2020 in kiev Hey... |
x = final_train.Sentiment.value_counts().sort_values().plot(kind="bar", figsize=(18, 10))
Tweet_texts using.¶final_sample["len_text_bc"] = final_sample['Tweet_texts_original'].str.len()
final_sample["len_text_ac"] = final_sample['Tweet_texts'].str.len()
import seaborn as sns
import matplotlib.pyplot as plt
fig, axes = plt.subplots(1, 2, figsize = (17.8, 10))
fig.suptitle('Boxplot of the Length of Texts', fontsize= 24)
# Plot the texts before cleaning
sns.boxplot(ax=axes[0], x=final_sample["len_text_bc"])
axes[0].set_title("Before Clean and Preprocess Texts", fontsize= 20)
# Plot the texts after cleaning
sns.boxplot(ax=axes[1], x=final_sample["len_text_ac"])
x= axes[1].set_title("After Clean and Preprocess Texts", fontsize= 20)
fig, axes = plt.subplots(1, 2, figsize = (17.8, 10))
fig.suptitle('Histogram of the Length of Texts', fontsize= 24)
# Plot the texts before cleaning
sns.histplot(ax=axes[0], data = final_sample, x="len_text_bc")
axes[0].set_title("Before Clean and Preprocess Texts", fontsize= 20)
# Plot the texts after cleaning
sns.histplot(ax=axes[1], data = final_sample, x="len_text_ac")
x= axes[1].set_title("After Clean and Preprocess Texts", fontsize= 20)
fig, axes = plt.subplots(1, 2, figsize = (17.8, 10))
fig.suptitle('Kernel Density of the Length of Texts', fontsize= 24)
sns.kdeplot(ax=axes[0], data = final_sample, x="len_text_bc")
axes[0].set_title("Before Clean and Preprocess Texts", fontsize= 20)
sns.kdeplot(ax=axes[1], data = final_sample, x="len_text_ac")
x= axes[1].set_title("After Clean and Preprocess Texts", fontsize= 20)
final_sample["count_word_bc"] = final_sample['Tweet_texts_original'].str.split().str.len()
final_sample["count_word_ac"] = final_sample['Tweet_texts'].str.split().str.len()
fig, axes = plt.subplots(1, 2, figsize = (17.8, 10))
fig.suptitle('Boxplot of the Word Count', fontsize= 24)
sns.boxplot(ax=axes[0], x=final_sample["count_word_bc"])
axes[0].set_title("Before Clean and Preprocess Texts", fontsize= 20)
sns.boxplot(ax=axes[1], x=final_sample["count_word_ac"])
x = axes[1].set_title("After Clean and Preprocess Texts", fontsize= 20)
fig, axes = plt.subplots(1, 2, figsize = (17.8, 10))
fig.suptitle('Histogram of the Word Count', fontsize= 24)
sns.histplot(ax=axes[0], data = final_sample, x="count_word_bc")
axes[0].set_title("Before Clean and Preprocess Texts", fontsize= 20)
sns.histplot(ax=axes[1], data = final_sample, x="count_word_ac")
x= axes[1].set_title("After Clean and Preprocess Texts", fontsize= 20)
fig, axes = plt.subplots(1, 2, figsize = (17.8, 10))
fig.suptitle('Kernel Density of the Word Count', fontsize= 24)
sns.kdeplot(ax=axes[0], data = final_sample, x="count_word_bc")
axes[0].set_title("Before Clean and Preprocess Texts", fontsize= 20)
sns.kdeplot(ax=axes[1], data = final_sample, x="count_word_ac")
x= axes[1].set_title("After Clean and Preprocess Texts", fontsize= 20)
Tweet_texts.¶import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
def get_top_tf_idf_words(df = final_sample, col = "Tweet_texts", use_idf = True, ngram_range =(1, 1), top_n= 5):
tf_idf = TfidfVectorizer(stop_words='english', ngram_range = ngram_range, use_idf = use_idf)
X_sparse_matrix = tf_idf.fit_transform(df[col])
feature_names = np.array(tf_idf.get_feature_names_out())
tf_idf_sparse_matrix = tf_idf.transform(df[col])
sorted_idx = np.argsort(tf_idf_sparse_matrix.data)[:-(top_n+1):-1]
return pd.DataFrame(
{'feature': feature_names[tf_idf_sparse_matrix.indices[sorted_idx]], # based on fancy indexing of Python
'tf_idf': tf_idf_sparse_matrix.data[sorted_idx], # based on fancy indexing of Python
})
top_n = 10
# Generate top tf-idf words for the texts before and after cleaning
df_text_bc = get_top_tf_idf_words(df = final_sample, col = "Tweet_texts_original", top_n= top_n)
df_text_ac = get_top_tf_idf_words(df = final_sample, col = "Tweet_texts", top_n= top_n)
x=range(0, top_n)
fig, ax = plt.subplots(2, 1, figsize = (17.8, 10))
fig.suptitle('Top Unigrams of the TF-IDF', fontsize= 24)
ax[0].plot(x, df_text_bc.tf_idf, 'bo')
ax[0].set_title(' Before Clean the Texts', fontsize= 20)
ax[0].set_xticks(x)
x_ticks_labels = df_text_bc.feature
ax[0].set_xticklabels(x_ticks_labels, rotation='vertical', fontsize=16)
ax[1].plot(x, df_text_ac.tf_idf, 'bo')
ax[1].set_title(' After Clean the Texts', fontsize= 20)
ax[1].set_xticks(x)
x_ticks_labels = df_text_ac.feature
ax[1].set_xticklabels(x_ticks_labels, rotation='vertical', fontsize=16)
fig.subplots_adjust(hspace=1.5)
plt.show()
import scattertext as st
corpus_bc = st.CorpusFromPandas(
final_sample,
category_col='Sentiment',
text_col='Tweet_texts_original',
nlp=nlp
).build()
corpus_ac = st.CorpusFromPandas(
final_sample,
category_col='Sentiment',
text_col='Tweet_texts',
nlp=nlp
).build()
term_freq_df_bc = corpus_bc.get_term_freq_df()
term_freq_df_bc['Positive Score'] = corpus_bc.get_scaled_f_scores('Positive')
term_freq_df_bc_freq = term_freq_df_bc.sort_values(by= 'Positive freq', ascending=False)
term_freq_df_bc_Score = term_freq_df_bc.sort_values(by= 'Positive Score', ascending=False)
term_freq_df_ac = corpus_ac.get_term_freq_df()
term_freq_df_ac['Positive Score'] = corpus_ac.get_scaled_f_scores('Positive')
term_freq_df_ac_freq = term_freq_df_ac.sort_values(by= 'Positive freq', ascending=False)
term_freq_df_ac_Score = term_freq_df_ac.sort_values(by= 'Positive Score', ascending=False)
term_freq_df_ac_freq = term_freq_df_ac_freq.head(10)
term_freq_df_bc_freq = term_freq_df_bc_freq.head(10)
x = range(0, top_n)
fig, ax = plt.subplots(2, 1, figsize = (17.8, 10))
fig.suptitle('Top 10 Frequency of Tokens Associated with Positive Review', fontsize= 24)
ax[0].plot(x, term_freq_df_bc_freq["Positive freq"], 'bo')
ax[0].set_title(' Before Clean the Texts', fontsize= 20)
ax[0].set_xticks(x)
x_ticks_labels = term_freq_df_bc_freq.index
ax[0].set_xticklabels(x_ticks_labels, rotation='vertical', fontsize=16)
ax[1].plot(x, term_freq_df_ac_freq["Positive freq"], 'bo')
ax[1].set_title(' After Clean the Texts', fontsize= 20)
ax[1].set_xticks(x)
x_ticks_labels = term_freq_df_ac_freq.index
ax[1].set_xticklabels(x_ticks_labels, rotation='vertical', fontsize=16)
fig.subplots_adjust(hspace=1.5)
plt.show()
term_freq_df_ac_Score = term_freq_df_ac_Score.head(10)
term_freq_df_bc_Score = term_freq_df_bc_Score.head(10)
x = range(0, top_n)
fig, ax = plt.subplots(2, 1, figsize = (17.8, 10))
fig.suptitle('Top 10 Positive Scores of Tokens', fontsize= 24)
ax[0].plot(x, term_freq_df_bc_Score["Positive Score"], 'bo')
ax[0].set_title(' Before Clean the Texts', fontsize= 20)
ax[0].set_xticks(x)
x_ticks_labels = term_freq_df_bc_Score.index
ax[0].set_xticklabels(x_ticks_labels, rotation='vertical', fontsize=16)
ax[1].plot(x, term_freq_df_ac_Score["Positive Score"], 'bo')
ax[1].set_title(' After Clean the Texts', fontsize= 20)
ax[1].set_xticks(x)
x_ticks_labels = term_freq_df_ac_Score.index
ax[1].set_xticklabels(x_ticks_labels, rotation='vertical', fontsize=16)
fig.subplots_adjust(hspace=1.5)
plt.show()
Tweet_texts to a matrix of token counts using CountVectorizer and unigrams and bigrams.¶from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(ngram_range=(1, 2))
token_count_matrix = vectorizer.fit_transform(final_sample['Tweet_texts'])
print(f'The size of the feature matrix for the texts = {token_count_matrix.get_shape()}')
print(f'The first row of the feature matrix = {token_count_matrix[0, ]}.')
print(f'There are {token_count_matrix[0, ].count_nonzero()}/{token_count_matrix.get_shape()[1]} non-zeros')
The size of the feature matrix for the texts = (412, 11666) The first row of the feature matrix = (0, 7367) 1 (0, 3320) 1 (0, 389) 1 (0, 2599) 1 (0, 2993) 1 (0, 8386) 1 (0, 7749) 1 (0, 3715) 1 (0, 7368) 1 (0, 3321) 1 (0, 391) 1 (0, 2602) 1 (0, 2995) 1 (0, 8387) 1 (0, 7768) 1. There are 15/11666 non-zeros
Tweet_texts using Tfidfvectorizer.¶from sklearn.feature_extraction.text import TfidfTransformer
tf_idf_transformer=TfidfTransformer(use_idf=True, smooth_idf=True, sublinear_tf=False)
tf_idf_matrix = tf_idf_transformer.fit_transform(token_count_matrix)
print(f'The size of the tf_idf matrix for the texts = {tf_idf_matrix.get_shape()}')
print(f'The sparse tf_idf matrix is as follows:')
print(tf_idf_matrix)
The size of the tf_idf matrix for the texts = (412, 11666) The sparse tf_idf matrix is as follows: (0, 8387) 0.2842318669248711 (0, 8386) 0.2842318669248711 (0, 7768) 0.2842318669248711 (0, 7749) 0.11916516625039764 (0, 7368) 0.2842318669248711 (0, 7367) 0.2660263952817386 (0, 3715) 0.22798253685323203 (0, 3321) 0.2660263952817386 (0, 3320) 0.22798253685323203 (0, 2995) 0.2842318669248711 (0, 2993) 0.2660263952817386 (0, 2602) 0.2842318669248711 (0, 2599) 0.21196775420525074 (0, 391) 0.2842318669248711 (0, 389) 0.24309021354356458 (1, 11560) 0.13798609592877528 (1, 11551) 0.12914788231199636 (1, 11549) 0.1179674770492143 (1, 11436) 0.13798609592877528 (1, 11413) 0.07895678444504205 (1, 11393) 0.13798609592877528 (1, 11388) 0.08207605611489964 (1, 10794) 0.13798609592877528 (1, 10793) 0.13798609592877528 (1, 10342) 0.13798609592877528 : : (411, 7488) 0.15327337585643078 (411, 7007) 0.1854595479406663 (411, 7002) 0.14484512069763583 (411, 6985) 0.1854595479406663 (411, 6975) 0.12844999182703026 (411, 6767) 0.1854595479406663 (411, 6748) 0.10612148601975521 (411, 6102) 0.1854595479406663 (411, 6096) 0.1195031900979706 (411, 4517) 0.09000671446932029 (411, 4509) 0.08485362514271744 (411, 4385) 0.1854595479406663 (411, 4384) 0.15861487172550595 (411, 3466) 0.1854595479406663 (411, 3465) 0.1854595479406663 (411, 2511) 0.1854595479406663 (411, 2448) 0.05671485960372425 (411, 1317) 0.15861487172550595 (411, 1248) 0.06478363643178943 (411, 955) 0.173580589477946 (411, 952) 0.1355153446734897 (411, 813) 0.1854595479406663 (411, 714) 0.05314383943095999 (411, 584) 0.1854595479406663 (411, 582) 0.15861487172550595
Tweet_texts between the 200th and 20,000th tweets.¶from sklearn.metrics.pairwise import cosine_similarity
cos_sim = cosine_similarity(tf_idf_matrix, dense_output=True)
print(f"The cosine similarity between '{final_sample.loc[final_sample.index[1], 'Tweet_texts']}' and '{final_sample.loc[final_sample.index[199], 'Tweet_texts']}' is {cos_sim[1,199]}")
The cosine similarity between 'canada remember say thank you all those work essential service the first responder the grocery store and pharmacy staff the taxi and bus driver the police officer and firefighter the utility worker there be many thank you covid19 coronavirus' and 'worldwide what lockdown rule around here people be drive around day and night the supermarket full and fail count number keep social distancing rule covid19pandemic coronavirus lockdown glasgow' is 0.020602255377710554
vector_array = []
for tweet in final_sample:
tweet_vector = nlp(tweet).vector
vector_array.append(tweet_vector)
vector_np_array = np.array(vector_array)
print(f"Corpus vector equal to the average of all the document vector is: \n{vector_np_array.mean(axis=0)}")
Corpus vector equal to the average of all the document vector is: [ 0.0565193 -0.49414828 -0.10164774 0.00322362 0.36567828 -0.6944256 -0.6884574 -0.72666305 -0.49822557 -0.11770191 0.73163146 -0.20141864 -0.07235938 0.31260857 -0.09462584 0.13198455 -0.04289513 0.02573293 -0.41180518 -0.39983496 0.00862799 -0.6349558 -0.50158864 0.04183084 -0.37459317 -0.14987499 0.14596067 -0.29919383 0.19768731 -0.35839328 -0.07461043 0.1899933 0.21382397 -0.48393226 0.54218155 0.5379165 -0.6085499 0.7726226 0.3184404 0.26572677 -0.26792827 0.42801246 -0.23268329 1.1774791 0.13425027 -0.28043184 0.3391191 -0.00379457 -0.43437925 -0.02607446 0.30645403 0.13425344 -0.0866674 -0.4868264 -0.03650158 -0.4598355 0.12153511 0.32877815 0.38710657 -0.33590257 0.7050007 -0.29092863 0.36540183 0.13695559 0.03570491 -0.5195349 -0.43508193 -0.36315703 0.46987548 0.3491594 0.4746903 -0.16927962 -0.4517566 -0.34064242 -0.03688206 0.6561437 0.56048626 -0.81480414 -0.38615215 -0.03461851 0.16246955 0.17831467 0.22751991 0.9142025 -0.7658522 -0.07577757 0.26036188 -0.7841981 0.09419399 0.16643332 -0.5678156 0.8421564 0.12429657 0.86047083 -0.17233676 -0.3721137 ]
def spacy_tokenizer(sentence):
mytokens = nlp(sentence)
mytokens = [ word.lemma_ for word in mytokens if word not in stop_words]
return mytokens
from sklearn.base import TransformerMixin
class features(TransformerMixin):
def transform(self, X, **transform_params):
return [clean_text(text) for text in X]
def fit(self, X, y= None, **fit_params):
return self
def get_params(self, deep= True):
return {}
from sklearn.model_selection import train_test_split
X = final_sample['Tweet_texts']
y = final_sample['Sentiment']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42, stratify = y)
print(f'X_train dimension: {X_train.shape}; y_train dimension: {y_train.shape}')
print(f'X_test dimension: {X_test.shape}; y_train dimension: {y_test.shape}')
X_train dimension: (329,); y_train dimension: (329,) X_test dimension: (83,); y_train dimension: (83,)
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from time import time
t0 = time()
tfidf_vector = TfidfVectorizer(tokenizer = spacy_tokenizer, ngram_range = (1,1))
classifier = RandomForestClassifier()
pipeline = Pipeline ([
("cleaner", features()),
("vectorizer", tfidf_vector),
("classifier", classifier)
])
pipeline.fit(X_train, y_train)
print(f"It takes about {time() - t0:.1f} seconds")
It takes about 3.7 seconds
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
target_names = ['Extremely Negative', 'Negative', 'Neutral', 'Positive', 'Extremely Positive']
titles_options = [
("Confusion matrix, without normalization", None),
("Normalized confusion matrix", 'true')
]
for title, normalize in titles_options:
y_pred = pipeline.predict(X_test)
cm = confusion_matrix (y_test, y_pred)
disp = ConfusionMatrixDisplay(
cm
)
print(title)
print(disp.confusion_matrix)
plt.show()
Confusion matrix, without normalization [[ 0 2 1 0 4] [ 0 2 4 0 8] [ 0 0 7 2 11] [ 0 0 3 4 9] [ 0 0 7 2 17]] Normalized confusion matrix [[ 0 2 1 0 4] [ 0 2 4 0 8] [ 0 0 7 2 11] [ 0 0 3 4 9] [ 0 0 7 2 17]]
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred, target_names = target_names))
precision recall f1-score support
Extremely Negative 0.00 0.00 0.00 7
Negative 0.50 0.14 0.22 14
Neutral 0.32 0.35 0.33 20
Positive 0.50 0.25 0.33 16
Extremely Positive 0.35 0.65 0.45 26
accuracy 0.36 83
macro avg 0.33 0.28 0.27 83
weighted avg 0.37 0.36 0.32 83
/opt/anaconda3/envs/dsci614/lib/python3.8/site-packages/sklearn/metrics/_classification.py:1308: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /opt/anaconda3/envs/dsci614/lib/python3.8/site-packages/sklearn/metrics/_classification.py:1308: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /opt/anaconda3/envs/dsci614/lib/python3.8/site-packages/sklearn/metrics/_classification.py:1308: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result))
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from pprint import pprint
from time import time
pipeline = Pipeline ([
("cleaner", features()),
("vectorizer", TfidfVectorizer(tokenizer = spacy_tokenizer)),
("classifier", DecisionTreeClassifier())
])
parameters = {
'vectorizer__max_df': (0.5, 1.0),
'vectorizer__ngram_range': ((1, 1), (1,2)), # unigrams or bigrams
'vectorizer__use_idf': (True, False),
'classifier__max_features': ["auto"],
}
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)
print("Performing grid search...")
print("The pipeline contains:", [name for name, _ in pipeline.steps])
print("parameters are as follows:")
pprint(parameters)
t0 = time()
grid_search.fit(X_train, y_train)
print(f"It takes about {time() - t0:.1f} seconds.")
print()
print(f"Best score= {grid_search.best_score_:0.3f}")
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
print("\t%s: %r" % (param_name, best_parameters[param_name]))
Performing grid search...
The pipeline contains: ['cleaner', 'vectorizer', 'classifier']
parameters are as follows:
{'classifier__max_features': ['auto'],
'vectorizer__max_df': (0.5, 1.0),
'vectorizer__ngram_range': ((1, 1), (1, 2)),
'vectorizer__use_idf': (True, False)}
Fitting 5 folds for each of 8 candidates, totalling 40 fits
It takes about 62.5 seconds.
Best score= 0.328
Best parameters set:
classifier__max_features: 'auto'
vectorizer__max_df: 1.0
vectorizer__ngram_range: (1, 2)
vectorizer__use_idf: True
from sklearn.metrics import plot_confusion_matrix
target_names = ['Extremely Negative', 'Negative', 'Neutral', 'Positive', 'Extremely Positive']
titles_options = [
("Confusion matrix, without normalization", None),
("Normalized confusion matrix", 'true')
]
for title, normalize in titles_options:
disp = plot_confusion_matrix(
grid_search, X_test, y_test,
display_labels= target_names,
cmap=plt.cm.Blues,
normalize=normalize
)
disp.ax_.set_title(title)
print(title)
print(disp.confusion_matrix)
plt.show()
/opt/anaconda3/envs/dsci614/lib/python3.8/site-packages/sklearn/utils/deprecation.py:87: FutureWarning: Function plot_confusion_matrix is deprecated; Function `plot_confusion_matrix` is deprecated in 1.0 and will be removed in 1.2. Use one of the class methods: ConfusionMatrixDisplay.from_predictions or ConfusionMatrixDisplay.from_estimator. warnings.warn(msg, category=FutureWarning)
Confusion matrix, without normalization [[ 0 1 5 0 1] [ 1 2 7 2 2] [ 1 1 7 4 7] [ 1 0 5 7 3] [ 0 7 11 5 3]]
/opt/anaconda3/envs/dsci614/lib/python3.8/site-packages/sklearn/utils/deprecation.py:87: FutureWarning: Function plot_confusion_matrix is deprecated; Function `plot_confusion_matrix` is deprecated in 1.0 and will be removed in 1.2. Use one of the class methods: ConfusionMatrixDisplay.from_predictions or ConfusionMatrixDisplay.from_estimator. warnings.warn(msg, category=FutureWarning)
Normalized confusion matrix [[0. 0.14285714 0.71428571 0. 0.14285714] [0.07142857 0.14285714 0.5 0.14285714 0.14285714] [0.05 0.05 0.35 0.2 0.35 ] [0.0625 0. 0.3125 0.4375 0.1875 ] [0. 0.26923077 0.42307692 0.19230769 0.11538462]]
y_pred = grid_search.predict(X_test)
print(classification_report(y_test, y_pred, target_names = target_names))
precision recall f1-score support
Extremely Negative 0.00 0.00 0.00 7
Negative 0.18 0.14 0.16 14
Neutral 0.20 0.35 0.25 20
Positive 0.39 0.44 0.41 16
Extremely Positive 0.19 0.12 0.14 26
accuracy 0.23 83
macro avg 0.19 0.21 0.19 83
weighted avg 0.21 0.23 0.21 83
from sklearn.base import BaseEstimator
from sklearn.linear_model import LogisticRegression
class ClfSwitcher(BaseEstimator):
def __init__(
self,
estimator = LogisticRegression(),
):
self.estimator = estimator
def fit(self, X, y=None, **kwargs):
self.estimator.fit(X, y)
return self
def predict(self, X, y=None):
return self.estimator.predict(X)
def predict_proba(self, X):
return self.estimator.predict_proba(X)
def score(self, X, y):
return self.estimator.score(X, y)
from sklearn.svm import SVC
pipeline = Pipeline ([
("cleaner", features()),
("vectorizer", TfidfVectorizer()),
("classifier", ClfSwitcher())
])
parameters = [
{
'vectorizer__tokenizer': [spacy_tokenizer],
'vectorizer__max_df': [1.0],
'vectorizer__norm': ('l1', 'l2'),
'vectorizer__stop_words': [None],
'classifier__estimator': [LogisticRegression(), ],
'classifier__estimator__penalty': ('l2', 'elasticnet', 'l1'),
'classifier__estimator__max_iter': [50, 80],
},
{
'vectorizer__tokenizer': [spacy_tokenizer],
'vectorizer__max_df': [1.0],
'vectorizer__norm': ('l1', 'l2'),
'vectorizer__stop_words': [None],
'classifier__estimator': [SVC()],
},
{
'vectorizer__tokenizer': [spacy_tokenizer],
'vectorizer__max_df': [1.0],
'vectorizer__norm': ('l1', 'l2'),
'vectorizer__stop_words': [None],
'classifier__estimator': [RandomForestClassifier()],
},
]
print("Performing grid search...")
print("The pipeline contains:", [name for name, _ in pipeline.steps])
print("parameters are as follows:")
pprint(parameters)
t0 = time()
gscv = GridSearchCV(pipeline, parameters, cv=5, n_jobs= -1, return_train_score=False, verbose=3)
gscv.fit(X_train, y_train)
print(f"It takes about {time() - t0:.3f} seconds")
Performing grid search...
The pipeline contains: ['cleaner', 'vectorizer', 'classifier']
parameters are as follows:
[{'classifier__estimator': [LogisticRegression()],
'classifier__estimator__max_iter': [50, 80],
'classifier__estimator__penalty': ('l2', 'elasticnet', 'l1'),
'vectorizer__max_df': [1.0],
'vectorizer__norm': ('l1', 'l2'),
'vectorizer__stop_words': [None],
'vectorizer__tokenizer': [<function spacy_tokenizer at 0x7fd90a141670>]},
{'classifier__estimator': [SVC()],
'vectorizer__max_df': [1.0],
'vectorizer__norm': ('l1', 'l2'),
'vectorizer__stop_words': [None],
'vectorizer__tokenizer': [<function spacy_tokenizer at 0x7fd90a141670>]},
{'classifier__estimator': [RandomForestClassifier()],
'vectorizer__max_df': [1.0],
'vectorizer__norm': ('l1', 'l2'),
'vectorizer__stop_words': [None],
'vectorizer__tokenizer': [<function spacy_tokenizer at 0x7fd90a141670>]}]
Fitting 5 folds for each of 16 candidates, totalling 80 fits
/opt/anaconda3/envs/dsci614/lib/python3.8/site-packages/sklearn/model_selection/_validation.py:372: FitFailedWarning:
40 fits failed out of a total of 80.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.
Below are more details about the failures:
--------------------------------------------------------------------------------
20 fits failed with the following error:
Traceback (most recent call last):
File "/opt/anaconda3/envs/dsci614/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 681, in _fit_and_score
estimator.fit(X_train, y_train, **fit_params)
File "/opt/anaconda3/envs/dsci614/lib/python3.8/site-packages/sklearn/pipeline.py", line 394, in fit
self._final_estimator.fit(Xt, y, **fit_params_last_step)
File "/var/folders/vr/3692k8jj3h905qkblwr9h5xc0000gn/T/ipykernel_59726/2885128314.py", line 13, in fit
File "/opt/anaconda3/envs/dsci614/lib/python3.8/site-packages/sklearn/linear_model/_logistic.py", line 1461, in fit
solver = _check_solver(self.solver, self.penalty, self.dual)
File "/opt/anaconda3/envs/dsci614/lib/python3.8/site-packages/sklearn/linear_model/_logistic.py", line 447, in _check_solver
raise ValueError(
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got elasticnet penalty.
--------------------------------------------------------------------------------
20 fits failed with the following error:
Traceback (most recent call last):
File "/opt/anaconda3/envs/dsci614/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 681, in _fit_and_score
estimator.fit(X_train, y_train, **fit_params)
File "/opt/anaconda3/envs/dsci614/lib/python3.8/site-packages/sklearn/pipeline.py", line 394, in fit
self._final_estimator.fit(Xt, y, **fit_params_last_step)
File "/var/folders/vr/3692k8jj3h905qkblwr9h5xc0000gn/T/ipykernel_59726/2885128314.py", line 13, in fit
File "/opt/anaconda3/envs/dsci614/lib/python3.8/site-packages/sklearn/linear_model/_logistic.py", line 1461, in fit
solver = _check_solver(self.solver, self.penalty, self.dual)
File "/opt/anaconda3/envs/dsci614/lib/python3.8/site-packages/sklearn/linear_model/_logistic.py", line 447, in _check_solver
raise ValueError(
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.
warnings.warn(some_fits_failed_message, FitFailedWarning)
/opt/anaconda3/envs/dsci614/lib/python3.8/site-packages/sklearn/model_selection/_search.py:969: UserWarning: One or more of the test scores are non-finite: [0.31020979 0.35291375 nan nan nan nan
0.31020979 0.35291375 nan nan nan nan
0.34662005 0.31328671 0.33752914 0.31934732]
warnings.warn(
It takes about 103.268 seconds
print(f"Best score= {gscv.best_score_:0.3f}")
best_parameters = gscv.best_estimator_.get_params()
all_classifiers =[]
for parameter in parameters:
all_classifiers.append(parameter['classifier__estimator'])
all_classifiers = [str(alg) for clf in all_classifiers for alg in clf]
print("All potential classifiers:")
pprint(all_classifiers)
idx = all_classifiers.index(str(best_parameters['classifier__estimator']))
print("Best parameters set:")
for param_name in sorted(parameters[idx].keys()):
print("\t%s: %r" % (param_name, best_parameters[param_name]))
Best score= 0.353 All potential classifiers: ['LogisticRegression(max_iter=50)', 'SVC()', 'RandomForestClassifier()'] Best parameters set: classifier__estimator: LogisticRegression(max_iter=50) classifier__estimator__max_iter: 50 classifier__estimator__penalty: 'l2' vectorizer__max_df: 1.0 vectorizer__norm: 'l2' vectorizer__stop_words: None vectorizer__tokenizer: <function spacy_tokenizer at 0x7fd90a141670>
from sklearn.metrics import confusion_matrix
y_pred = gscv.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
ax = plt.axes()
sns.heatmap(cm, annot=True, fmt = 'd', xticklabels = target_names, yticklabels = target_names)
ax.set_title("Confusion matrix, without normalization")
plt.show()
y_pred = gscv.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
ax = plt.axes()
sns.heatmap(cm, annot=True, fmt = 'd', xticklabels = target_names, yticklabels = target_names)
ax.set_title("Confusion matrix, without normalization")
plt.show()
y_pred = gscv.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
cm = cm / cm.astype(np.float64).sum(axis=1)[:,None]
ax = plt.axes()
sns.heatmap(cm, annot=True, fmt = '.2f', xticklabels = target_names, yticklabels = target_names)
ax.set_title("Normalized confusion matrix")
plt.show()
from sklearn.metrics import classification_report
y_pred = gscv.predict(X_test)
print(classification_report(y_test, y_pred, target_names = target_names))
precision recall f1-score support
Extremely Negative 0.00 0.00 0.00 7
Negative 0.67 0.14 0.24 14
Neutral 0.30 0.15 0.20 20
Positive 1.00 0.12 0.22 16
Extremely Positive 0.34 0.88 0.49 26
accuracy 0.36 83
macro avg 0.46 0.26 0.23 83
weighted avg 0.48 0.36 0.28 83
/opt/anaconda3/envs/dsci614/lib/python3.8/site-packages/sklearn/metrics/_classification.py:1308: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /opt/anaconda3/envs/dsci614/lib/python3.8/site-packages/sklearn/metrics/_classification.py:1308: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /opt/anaconda3/envs/dsci614/lib/python3.8/site-packages/sklearn/metrics/_classification.py:1308: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result))
tf_vectorizer = CountVectorizer(max_df=0.90, min_df=5, stop_words='english')
document_word_matrix_tf = tf_vectorizer.fit_transform(final_sample['Tweet_texts'])
from sklearn.decomposition import LatentDirichletAllocation
lda = LatentDirichletAllocation(n_components= 5 ,random_state= 101)
lda.fit(document_word_matrix_tf)
LatentDirichletAllocation(n_components=5, random_state=101)
n_top_words = 15
for index,topic in enumerate(lda.components_):
print(f'THE TOP {n_top_words} WORDS FOR TOPIC #{index}')
print([tf_vectorizer.get_feature_names_out()[j] for j in topic.argsort()[-n_top_words:]])
print('#'*100)
THE TOP 15 WORDS FOR TOPIC #0 ['supermarket', 'help', 'service', 'delivery', 'stay', 'work', 'consumer', 'people', 'home', 'shopping', 'demand', 'online', 'covid', '19', 'food'] #################################################################################################### THE TOP 15 WORDS FOR TOPIC #1 ['buy', 'amp', 'toiletpaper', 'home', '2020', 'kingdom', 'stock', 'supermarket', 'pandemic', 'united', 'people', 'food', 'covid19', 'price', 'coronavirus'] #################################################################################################### THE TOP 15 WORDS FOR TOPIC #2 ['think', 'shelf', 'covid19', 'covid_19', 'thank', 'know', 'work', 'worker', 'people', 'supermarket', 'like', 'nt', 'coronavirus', 'grocery', 'store'] #################################################################################################### THE TOP 15 WORDS FOR TOPIC #3 ['supply', 'buy', 'live', 'food', 'just', 'time', 'mask', 'day', 'shopping', 'online', 'use', 'supermarket', 'sanitizer', 'hand', 'coronavirus'] #################################################################################################### THE TOP 15 WORDS FOR TOPIC #4 ['amid', 'oil', 'business', 'pandemic', 'crisis', 'india', 'good', 'nt', 'impact', 'coronavirus', 'consumer', 'amp', 'price', 'covid', '19'] ####################################################################################################
def plot_top_words(model, feature_names, n_top_words, title):
fig, axes = plt.subplots(3, 2, figsize=(30, 15), sharex=True)
axes = axes.flatten()
for topic_idx, topic in enumerate(model.components_):
top_features_ind = topic.argsort()[:-n_top_words - 1:-1]
top_features = [feature_names[i] for i in top_features_ind]
weights = topic[top_features_ind]
ax = axes[topic_idx]
ax.barh(top_features, weights, height=0.7)
ax.set_title(
f'Topic {topic_idx +1}',
fontdict={'fontsize': 30}
)
ax.invert_yaxis()
ax.tick_params(
axis='both',
which='major',
labelsize=20
)
for i in 'top right left'.split():
ax.spines[i].set_visible(False)
fig.suptitle(title, fontsize=40)
plt.subplots_adjust(top=0.90, bottom=0.05, wspace=0.90, hspace=0.3)
plt.show()
tf_feature_names = tf_vectorizer.get_feature_names_out()
plot_top_words(lda, tf_feature_names, n_top_words, 'Topics in LDA model')
from sklearn.decomposition import LatentDirichletAllocation
tfidf_vectorizer = TfidfVectorizer(
stop_words = 'english',
lowercase = True,
token_pattern = r'\b[a-zA-Z0-9]{2,}\b',
max_df = 0.9,
min_df = 5
)
document_word_matrix_tf_idf = tfidf_vectorizer.fit_transform(final_sample['Tweet_texts'])
lda_tf_idf = LatentDirichletAllocation(n_components= 5, random_state= 101)
lda_tf_idf.fit(document_word_matrix_tf_idf)
LatentDirichletAllocation(n_components=5, random_state=101)
for index,topic in enumerate(lda_tf_idf.components_):
print(f'THE TOP {n_top_words} WORDS FOR TOPIC #{index}')
print([tf_vectorizer.get_feature_names_out()[j] for j in topic.argsort()[-n_top_words:]])
print('#'*100)
THE TOP 15 WORDS FOR TOPIC #0 ['drive', 'consider', 'usa', 'coronavirus', 'amp', 'street', 'wonder', 'time', 'remember', 'houston', 'paper', 'covid', '19', 'work', 'consumer'] #################################################################################################### THE TOP 15 WORDS FOR TOPIC #1 ['low', 'life', 'offer', 'news', 'sell', 'pay', 'sanitizer', 'amp', 'coronavirus', 'power', 'great', 'covid', '19', 'fight', 'stop'] #################################################################################################### THE TOP 15 WORDS FOR TOPIC #2 ['think', 'kind', 'news', 'need', 'usa', '19', 'pay', 'wonder', 'line', 'toilet', 'hit', 'covid19', 'states', 'street', 'coronavirus'] #################################################################################################### THE TOP 15 WORDS FOR TOPIC #3 ['need', 'donate', 'panic', '19', 'shelf', 'covid', 'lockdown', 'fight', 'coronavirus', 'today', 'street', 'customer', 'local', 'oil', 'shop'] #################################################################################################### THE TOP 15 WORDS FOR TOPIC #4 ['urge', 'united', 'help', 'include', 'know', 'covid19', 'outside', 'lose', 'safe', 'delivery', 'grocery', 'free', 'market', 'coronavirus', 'power'] ####################################################################################################
tf_feature_names = tfidf_vectorizer.get_feature_names_out()
plot_top_words(lda_tf_idf, tf_feature_names, n_top_words, 'Topics in LDA model')
import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()
pyLDAvis.sklearn.prepare(
lda_model = lda_tf_idf,
dtm = document_word_matrix_tf_idf,
vectorizer = tfidf_vectorizer
)
/opt/anaconda3/envs/dsci614/lib/python3.8/site-packages/sklearn/utils/deprecation.py:87: FutureWarning: Function get_feature_names is deprecated; get_feature_names is deprecated in 1.0 and will be removed in 1.2. Please use get_feature_names_out instead. warnings.warn(msg, category=FutureWarning) /opt/anaconda3/envs/dsci614/lib/python3.8/site-packages/pyLDAvis/_prepare.py:246: FutureWarning: In a future version of pandas all arguments of DataFrame.drop except for the argument 'labels' will be keyword-only default_term_info = default_term_info.sort_values(
[CV 5/5] END classifier__estimator=LogisticRegression(), classifier__estimator__max_iter=50, classifier__estimator__penalty=l2, vectorizer__max_df=1.0, vectorizer__norm=l1, vectorizer__stop_words=None, vectorizer__tokenizer=<function spacy_tokenizer at 0x7fb02b7e9940>;, score=0.369 total time= 8.8s [CV 3/5] END classifier__estimator=LogisticRegression(), classifier__estimator__max_iter=50, classifier__estimator__penalty=elasticnet, vectorizer__max_df=1.0, vectorizer__norm=l1, vectorizer__stop_words=None, vectorizer__tokenizer=<function spacy_tokenizer at 0x7fb02b7e9e50>;, score=nan total time= 7.9s [CV 4/5] END classifier__estimator=LogisticRegression(), classifier__estimator__max_iter=50, classifier__estimator__penalty=elasticnet, vectorizer__max_df=1.0, vectorizer__norm=l2, vectorizer__stop_words=None, vectorizer__tokenizer=<function spacy_tokenizer at 0x7fb00fdabee0>;, score=nan total time= 7.7s [CV 2/5] END classifier__estimator=LogisticRegression(), classifier__estimator__max_iter=50, classifier__estimator__penalty=l1, vectorizer__max_df=1.0, vectorizer__norm=l2, vectorizer__stop_words=None, vectorizer__tokenizer=<function spacy_tokenizer at 0x7fb02b0255e0>;, score=nan total time= 7.2s [CV 5/5] END classifier__estimator=LogisticRegression(), classifier__estimator__max_iter=80, classifier__estimator__penalty=l2, vectorizer__max_df=1.0, vectorizer__norm=l1, vectorizer__stop_words=None, vectorizer__tokenizer=<function spacy_tokenizer at 0x7fb00faf9430>;, score=0.369 total time= 10.2s [CV 3/5] END classifier__estimator=LogisticRegression(), classifier__estimator__max_iter=80, classifier__estimator__penalty=elasticnet, vectorizer__max_df=1.0, vectorizer__norm=l1, vectorizer__stop_words=None, vectorizer__tokenizer=<function spacy_tokenizer at 0x7fb00faf94c0>;, score=nan total time= 10.7s [CV 1/5] END classifier__estimator=LogisticRegression(), classifier__estimator__max_iter=80, classifier__estimator__penalty=l1, vectorizer__max_df=1.0, vectorizer__norm=l1, vectorizer__stop_words=None, vectorizer__tokenizer=<function spacy_tokenizer at 0x7fb00fb21160>;, score=nan total time= 8.7s [CV 4/5] END classifier__estimator=LogisticRegression(), classifier__estimator__max_iter=80, classifier__estimator__penalty=l1, vectorizer__max_df=1.0, vectorizer__norm=l2, vectorizer__stop_words=None, vectorizer__tokenizer=<function spacy_tokenizer at 0x7fb02b025e50>;, score=nan total time= 6.9s [CV 2/5] END classifier__estimator=SVC(), vectorizer__max_df=1.0, vectorizer__norm=l2, vectorizer__stop_words=None, vectorizer__tokenizer=<function spacy_tokenizer at 0x7fb00fdabb80>;, score=0.288 total time= 7.9s [CV 5/5] END classifier__estimator=RandomForestClassifier(), vectorizer__max_df=1.0, vectorizer__norm=l1, vectorizer__stop_words=None, vectorizer__tokenizer=<function spacy_tokenizer at 0x7fb00fdab790>;, score=0.385 total time= 9.0s [CV 2/5] END classifier__estimator=LogisticRegression(), classifier__estimator__max_iter=50, classifier__estimator__penalty=l2, vectorizer__max_df=1.0, vectorizer__norm=l1, vectorizer__stop_words=None, vectorizer__tokenizer=<function spacy_tokenizer at 0x7f80fc0ceb80>;, score=0.273 total time= 8.1s [CV 5/5] END classifier__estimator=LogisticRegression(), classifier__estimator__max_iter=50, classifier__estimator__penalty=l2, vectorizer__max_df=1.0, vectorizer__norm=l2, vectorizer__stop_words=None, vectorizer__tokenizer=<function spacy_tokenizer at 0x7f80e082cc10>;, score=0.462 total time= 10.6s [CV 4/5] END classifier__estimator=LogisticRegression(), classifier__estimator__max_iter=50, classifier__estimator__penalty=l1, vectorizer__max_df=1.0, vectorizer__norm=l1, vectorizer__stop_words=None, vectorizer__tokenizer=<function spacy_tokenizer at 0x7f80e082cb80>;, score=nan total time= 7.4s [CV 2/5] END classifier__estimator=LogisticRegression(), classifier__estimator__max_iter=80, classifier__estimator__penalty=l2, vectorizer__max_df=1.0, vectorizer__norm=l1, vectorizer__stop_words=None, vectorizer__tokenizer=<function spacy_tokenizer at 0x7f80fc0cee50>;, score=0.273 total time= 10.0s [CV 5/5] END classifier__estimator=LogisticRegression(), classifier__estimator__max_iter=80, classifier__estimator__penalty=l2, vectorizer__max_df=1.0, vectorizer__norm=l2, vectorizer__stop_words=None, vectorizer__tokenizer=<function spacy_tokenizer at 0x7f80fc0ce940>;, score=0.462 total time= 9.7s [CV 3/5] END classifier__estimator=LogisticRegression(), classifier__estimator__max_iter=80, classifier__estimator__penalty=elasticnet, vectorizer__max_df=1.0, vectorizer__norm=l2, vectorizer__stop_words=None, vectorizer__tokenizer=<function spacy_tokenizer at 0x7f80fb932550>;, score=nan total time= 11.1s [CV 1/5] END classifier__estimator=LogisticRegression(), classifier__estimator__max_iter=80, classifier__estimator__penalty=l1, vectorizer__max_df=1.0, vectorizer__norm=l2, vectorizer__stop_words=None, vectorizer__tokenizer=<function spacy_tokenizer at 0x7f80fc0cee50>;, score=nan total time= 7.6s [CV 4/5] END classifier__estimator=SVC(), vectorizer__max_df=1.0, vectorizer__norm=l1, vectorizer__stop_words=None, vectorizer__tokenizer=<function spacy_tokenizer at 0x7f80fb932670>;, score=0.348 total time= 8.3s [CV 2/5] END classifier__estimator=RandomForestClassifier(), vectorizer__max_df=1.0, vectorizer__norm=l1, vectorizer__stop_words=None, vectorizer__tokenizer=<function spacy_tokenizer at 0x7f80fb932940>;, score=0.258 total time= 8.9s [CV 5/5] END classifier__estimator=RandomForestClassifier(), vectorizer__max_df=1.0, vectorizer__norm=l2, vectorizer__stop_words=None, vectorizer__tokenizer=<function spacy_tokenizer at 0x7f80fb932b80>;, score=0.385 total time= 6.3s [CV 1/5] END classifier__estimator=LogisticRegression(), classifier__estimator__max_iter=50, classifier__estimator__penalty=l2, vectorizer__max_df=1.0, vectorizer__norm=l1, vectorizer__stop_words=None, vectorizer__tokenizer=<function spacy_tokenizer at 0x7fd06a8ef790>;, score=0.364 total time= 8.0s [CV 4/5] END classifier__estimator=LogisticRegression(), classifier__estimator__max_iter=50, classifier__estimator__penalty=l2, vectorizer__max_df=1.0, vectorizer__norm=l2, vectorizer__stop_words=None, vectorizer__tokenizer=<function spacy_tokenizer at 0x7fd06a8ef790>;, score=0.258 total time= 10.7s [CV 3/5] END classifier__estimator=LogisticRegression(), classifier__estimator__max_iter=50, classifier__estimator__penalty=l1, vectorizer__max_df=1.0, vectorizer__norm=l1, vectorizer__stop_words=None, vectorizer__tokenizer=<function spacy_tokenizer at 0x7fd06a8ef790>;, score=nan total time= 7.4s [CV 1/5] END classifier__estimator=LogisticRegression(), classifier__estimator__max_iter=80, classifier__estimator__penalty=l2, vectorizer__max_df=1.0, vectorizer__norm=l1, vectorizer__stop_words=None, vectorizer__tokenizer=<function spacy_tokenizer at 0x7fd04f01bf70>;, score=0.364 total time= 9.9s [CV 4/5] END classifier__estimator=LogisticRegression(), classifier__estimator__max_iter=80, classifier__estimator__penalty=l2, vectorizer__max_df=1.0, vectorizer__norm=l2, vectorizer__stop_words=None, vectorizer__tokenizer=<function spacy_tokenizer at 0x7fd06a13e940>;, score=0.258 total time= 9.8s [CV 2/5] END classifier__estimator=LogisticRegression(), classifier__estimator__max_iter=80, classifier__estimator__penalty=elasticnet, vectorizer__max_df=1.0, vectorizer__norm=l2, vectorizer__stop_words=None, vectorizer__tokenizer=<function spacy_tokenizer at 0x7fd04d130940>;, score=nan total time= 11.2s [CV 5/5] END classifier__estimator=LogisticRegression(), classifier__estimator__max_iter=80, classifier__estimator__penalty=l1, vectorizer__max_df=1.0, vectorizer__norm=l1, vectorizer__stop_words=None, vectorizer__tokenizer=<function spacy_tokenizer at 0x7fd06a13e790>;, score=nan total time= 7.7s [CV 3/5] END classifier__estimator=SVC(), vectorizer__max_df=1.0, vectorizer__norm=l1, vectorizer__stop_words=None, vectorizer__tokenizer=<function spacy_tokenizer at 0x7fd04d130af0>;, score=0.348 total time= 8.2s [CV 1/5] END classifier__estimator=RandomForestClassifier(), vectorizer__max_df=1.0, vectorizer__norm=l1, vectorizer__stop_words=None, vectorizer__tokenizer=<function spacy_tokenizer at 0x7fd04d130ee0>;, score=0.364 total time= 8.9s [CV 4/5] END classifier__estimator=RandomForestClassifier(), vectorizer__max_df=1.0, vectorizer__norm=l2, vectorizer__stop_words=None, vectorizer__tokenizer=<function spacy_tokenizer at 0x7fd04ed6e430>;, score=0.288 total time= 6.5s